#===============================================================================
#  File:    LSS.R
#  Date:    December, 2021 
#  Author:  Natalia Umansky
#  Purpose: Fitting the LSS model and estimating the security score of each tweet
#===============================================================================

# LIBRARIES
#===============================================================================
require(quanteda)
require(LSX)
library(data.table)
library(rtweet)
library(stringr)
library(stringi)
library(qdapRegex)
library(tidyverse)
library(kableExtra)
library(ggplot2)
library(cowplot)

# Set quanteda's `threads` option to 8 for the rest of the session.
quanteda_options(threads = 8)

# IMPORT DATA
#===============================================================================

# Names of the five account groups; each one doubles as the CSV file stem in
# the home directory and as the name of the data.table created below.
Groups <- c(
  "Politicians",
  "Media",
  "Citizens",
  "Advocates",
  "Friends"
)

# Read the six needed columns of every group file and bind the result to a
# variable named after the group.
for (i in Groups) {
  df <- fread(
    paste0("~/", i, ".csv"),
    select = c(
      "user_id", "status_id", "created_at",
      "screen_name", "text", "hashtags"
    )
  )
  assign(x = i, value = df)
}

# Pooled table of all groups (row order: Media first, then the others).
ALL <- rbind(
  Media,
  Politicians,
  Citizens,
  Advocates,
  Friends
)


# FUNCTIONS
#===============================================================================

# Cleaning text

# Turn a table of tweets into a trimmed document-feature matrix.
#
# Args:
#   x:          data frame handed to quanteda::corpus(); the tweet text is
#               expected in the column corpus() picks up by default (`text`),
#               remaining columns travel along as docvars.
#   stop_terms: character vector of features to drop from the dfm. Defaults
#               to the script-level `stops` vector (looked up lazily, so it
#               only has to exist by the time the function is called).
#
# Returns:
#   A dfm restricted to lower-cased alphanumeric tokens, with the listed
#   multi-word expressions joined by "_", pure-number tokens and stop terms
#   removed, rare features trimmed and one-character features dropped.
cleaning <- function(x, stop_terms = stops){
  # Multi-word expressions that must survive as single features. They are
  # compounded BEFORE stop-word removal because several of them contain stop
  # words ("not", "a", "of", ...).
  compounds <- list(c("no", "safe"), c("not", "safe"),
                    c("no", "normal"), c("not", "normal"),
                    c("no", "peace"), c("not", "peace"),
                    c("new", "zealand"), c("united", "states"),
                    c("united", "kingdom"), c("north", "korea"),
                    c("sri", "lanka"), c("el", "paso"),
                    c("no", "danger"), c("no", "harm"),
                    # "no_threat" is used as a seed word, so it needs a compound
                    c("no", "threat"),
                    c("not", "an", "emergency"),
                    c("strong", "ties"),
                    c("existential", "threat"),
                    c("existential", "threats"),
                    c("not", "a", "threat"),
                    c("existential", "crisis"),
                    # original (misspelled) entry kept, correct spelling added
                    c("deeply", "concearned"),
                    c("deeply", "concerned"),
                    c("take", "action"),
                    c("under", "attack"),
                    c("out", "of", "danger"))

  toks <- corpus(x) %>%
    tokens(remove_punct = TRUE) %>%
    # keep purely alphanumeric tokens; this also discards symbols and
    # hyphenated forms, so no further symbol/hyphen handling is needed
    tokens_select("^[0-9a-zA-Z]+$", valuetype = "regex") %>%
    tokens_tolower() %>%
    tokens_compound(pattern = compounds) %>%
    # drop tokens consisting only of digits (after compounding, as before)
    tokens_remove("^[0-9]+$", valuetype = "regex")

  # Stop words are removed with dfm_remove() instead of the deprecated
  # `remove =` argument of dfm(), which newer quanteda releases no longer
  # honour for tokens input.
  tweets.dfm <- dfm(toks, tolower = TRUE, verbose = TRUE) %>%
    dfm_remove(pattern = stop_terms) %>%
    dfm_trim(min_termfreq = 10, termfreq_type = "count",
             min_docfreq = .001, docfreq_type = "prop", verbose = TRUE) %>%
    dfm_select(min_nchar = 2, selection = "keep")

  return(tweets.dfm)
}


# Predicting LSS values for each document

# Score every document of a dfm with the fitted LSS model.
#
# Relies on the script-level model object `tmod_lss`.
#
# Args:
#   x: dfm whose docvars include `created_at`, `screen_name` and `status_id`
#      (and optionally `is_retweet`).
#   y: data frame the dfm was built from, one row per document of `x`;
#      supplies the `text` column of the output.
#
# Returns:
#   A data.frame holding the columns produced by predict() (the code below
#   uses `fit` and `se.fit`) plus `date`, `screen_name`, `status_id`, `text`,
#   `is_retweet` and `select`, where `select` is TRUE when the score minus
#   its standard error is above zero.
pred_function <- function(x, y){
  pred <- as.data.frame(predict(tmod_lss, se.fit = TRUE, newdata = x))

  # `text` is attached positionally, so rows must line up with documents.
  if (nrow(pred) != nrow(y)) {
    stop("pred_function: `x` has ", nrow(pred), " documents but `y` has ",
         nrow(y), " rows", call. = FALSE)
  }

  pred$date <- as.Date(docvars(x, "created_at"))
  pred$screen_name <- docvars(x, "screen_name")
  pred$status_id <- docvars(x, "status_id")
  pred$text <- y$text

  # `is_retweet` is not among the columns read at import time; fall back to
  # NA instead of failing when the docvar is absent.
  if ("is_retweet" %in% names(docvars(x))) {
    pred$is_retweet <- docvars(x, "is_retweet")
  } else {
    pred$is_retweet <- rep(NA, nrow(pred))
  }

  pred$select <- (pred$fit - pred$se.fit) > 0
  return(pred)
}

# Export the first and last `n` model coefficients side by side.
#
# Args:
#   x:         fitted model object understood by coef().
#   n:         number of coefficients taken from each end of coef(x).
#   file_name: path of the comma-separated file to write.
#
# Returns:
#   The combined data.table (term and value for the first n coefficients,
#   followed by term and value for the last n).
scaled_table <- function(x, n, file_name){
  # NOTE: the expressions inside as.data.frame() are kept verbatim because
  # they determine the header of the value columns.
  upper <- as.data.frame(head(coef(x), n))
  lower <- as.data.frame(tail(coef(x), n))

  # Convert in place; the terms (row names) become the leading `rn` column.
  setDT(upper, keep.rownames = TRUE)
  setDT(lower, keep.rownames = TRUE)

  combined <- cbind(upper, lower)

  write.table(
    combined,
    file = file_name,
    append = FALSE,
    sep = ",",
    dec = ".",
    row.names = TRUE,
    col.names = TRUE,
    fileEncoding = "UTF-8"
  )

  return(combined)
}


# FITTING THE LSS MODEL
#===============================================================================
# Stop-word list removed from the document-feature matrices.
stops <- c(
  # pronouns
  "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
  "you", "your", "yours", "yourself", "yourselves",
  "he", "him", "his", "himself", "she", "her", "hers", "herself",
  "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
  # interrogatives and demonstratives
  "what", "which", "who", "whom", "this", "that", "these", "those",
  # auxiliary verbs
  "am", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "having", "do", "does", "did", "doing",
  "would", "should", "could", "ought",
  # contractions written with an apostrophe
  "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
  "i've", "you've", "we've", "they've",
  "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
  "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
  "let's", "that's", "who's", "what's", "here's", "there's",
  "when's", "where's", "why's", "how's",
  # articles, conjunctions and prepositions
  "a", "an", "the", "and", "but", "if", "or", "because", "as",
  "until", "while", "of", "at", "by", "for", "with", "about", "against",
  "between", "into", "through", "during", "before", "after",
  "above", "below", "to", "from", "up", "down", "in", "out",
  "on", "off", "over", "under",
  # adverbs and quantifiers
  "again", "further", "then", "once", "here", "there",
  "when", "where", "why", "how",
  "all", "any", "both", "each", "few", "more", "most", "other",
  "some", "such", "only", "own", "same", "so", "than", "too", "very", "will",
  # apostrophe-less spellings of the contractions above
  "im", "youre", "hes", "shes", "its", "were", "theyre",
  "ive", "youve", "weve", "theyve",
  "id", "youd", "hed", "shed", "wed", "theyd",
  "youll", "theyll",
  "lets", "thats", "whos", "whats", "heres", "theres",
  "whens", "wheres", "whys", "hows",
  # additional entries
  "the", "to", "can", "will", "rt", "ones", "hell"
)


# Normalise the pooled tweet text before tokenisation:
#   1. drop @mentions (regex match; a fixed-string match on the pattern
#      would never find anything),
#   2. drop Twitter short URLs,
#   3. drop the HTML entity "&amp;" (matching the bare string "amp" would
#      also damage ordinary words such as "campaign" or "example"),
#   4. drop punctuation (this also removes any remaining "@" and "#"),
#   5. drop non-ASCII characters such as emojis,
#   6. drop digits.
ALL$text <- ALL$text %>%
  stri_replace_all_regex("@\\w+", "") %>%
  rm_twitter_url() %>%
  str_replace_all("&amp;", "") %>%
  str_remove_all("[[:punct:]]") %>%
  str_replace_all("[^\x01-\x7F]", "") %>%  # removes emojis
  str_replace_all("[[:digit:]]+", "")



# Build the pooled document-feature matrix in chunks of at most `chunk`
# rows: each chunk is run through cleaning() separately and the resulting
# dfms are row-bound into `tweets.dfm`.
chunk <- 100000
n <- nrow(ALL)

# Chunk index per row: rows 1..chunk -> 1, the next `chunk` rows -> 2, ...
# seq_len() keeps this well-defined when ALL has no rows.
r <- ceiling(seq_len(n) / chunk)
d <- split(ALL, r)
names(d) <- paste0("All_", seq_along(d))

# One dfm per chunk. The list is left unnamed so that the chunk labels are
# not passed to rbind() as argument names.
dfmt <- lapply(unname(d), cleaning)

tweets.dfm <- do.call("rbind", dfmt)


# Seed dictionary handed to as.seedwords(): the first vector holds the
# threat/insecurity seeds, the second the safety/normality seeds.
# Entries containing "_" are multi-word expressions and only match if the
# tokens were compounded with the same spelling during cleaning.
# The misspelled entries "warefare", "collaps" and "deeply_concearned" are
# kept as they were; their correct spellings are added next to them.
seeds <- list(c(
  "crisis", "dangerous",
  "harm", "fear",
  "tense", "hostile",
  "danger", "aggression",
  "weapon", "force",
  "genocide", "war",
  "destruction", "threat",
  "enemy", "attack",
  "protect", "security",
  "insecurity", 
  "emergency", "devastation",
  "hostile", "hostility",
  "victim", "escalation",
  "deterrence", "combat",
  "fight", "arms",
  "defense", "terror",
  "unsafe", "violence",
  "crises", "fears",
  "dangers", "aggressions",
  "weapons", "genocides",
  "wars", "threats",
  "enemies", "attacks",
  "emergencies", "victims",
  "fights", "warefare", "warfare",
  "insurgent", "insurgents",
  "protect", "hazard", "hazards",
  "no_peace","not_peace",
  "no_safe", "not_safe",
  "no_normal", "not_normal",
  "existential_threat", "existential_threats",
  "existential_crisis",
  "annihilation", "annihilate", "annihilations",
  "destroy", "dangerous",
  "destroyed", "apocalypse",
  "deeply_concearned", "deeply_concerned", "threatening",
  "threatens", "extinction", "alarming",
  "destruction", "harm", "ravaging",
  "ravage", "alarm", "alarmed",
  "crime", "crimes", "urgency", "decimate",
  "catastrophe", "catastrophic", "urgencies",
  "outcry", "slaughter", "illegal", "decimates",
  "slaughters",
  "terrifying", "collaps", "collapse", "collapsing",
  "take_action", "tragedy", "tragic", "tragedies",
  "survival", "disaster", "under_attack", "disasters",
  "dramatic", "carnage", "urgent", "urgency", "carnages",
  "critical", "invasion", "invasions", "conflict", "conflicts"),
  c(
    "normal", "safe",
    "harmless", "appeasement",
    "concessions", "peace",
    "peaceful", "freedom",
    "friendly", "resolution",
    "calm", "stable",
    "democratic",
    "unity", 
    "agreement", "relax",
    "compromise", "prosperity",
    "equality", "dialogue",
    "commitment",
    "sustainable", "progress",
    "diversity",
    "shared", "promote",
    "agreements", "compromises",
    "dialogues", "commitments",
    "concessions", "conciliation",
    "harmony", "accord",
    "accords", "treaty",
    "treaties",
    "no_danger", "no_harm",
    "no_threat", "not_an_emergency",
    "strong_ties", "not_a_threat",
    "friend", "friends", "calmness",
    "ease", "success", "successful",
    "build", "develop", "development",
    "growth", "improvement", "healthy",
    "agreeable", "pleasant", "kind", "kindness",
    "public", "everyday", "usual",
    "daily", "habitual", "harmony",
    "cessation", "concord", "neutral",
    "friendship", "order", "reliable",
    "stable", "solved", "out_of_danger",
    "guarded", "preserved", "secure", "vindicated",
    "truce", "amity", "restful",
    "lasting", "relaxation", "cooperate",
    "cooperation", "affinity", "community",
    "innocuous", "collaboration", "partnership",
    "collaborations", "partnerships"
  ))

# Fit the LSS model on the pooled dfm using the seed dictionary, then write
# the 100 first and 100 last coefficients to "scale_table.csv".
tmod_lss <- textmodel_lss(
  tweets.dfm,
  seeds = as.seedwords(seeds),
  auto_weight = FALSE,
  cache = TRUE
)
table <- scaled_table(x = tmod_lss, n = 100, file_name = "scale_table.csv")

#TEXT PLOT####################################################################################

# Plot the fitted model with all seed words highlighted in black, add a
# dotted red vertical reference line at zero and save it as Figure 4.
scaled_textplot <- textplot_scale1d(
  tmod_lss,
  highlighted = unlist(seeds),
  highlighted_color = "black"
)
scaled_textplot <- scaled_textplot +
  geom_vline(xintercept = 0, colour = "red", linetype = "dotted")

ggsave(
  filename = "Figure-4.pdf",
  plot = scaled_textplot,
  width = 7,
  height = 5
)

#PREDICTING SECURITY SCORE FOR EACH TWEET
#===============================================================================
# For every group: normalise the tweet text, build the dfm chunk by chunk,
# score each chunk with pred_function() and store the stacked result in a
# variable named "pred_<group in lower case>".
for (i in Groups){
  group_df <- get(i)

  # Normalised copy used to build the dfm. The pipeline result used to be
  # thrown away, so scoring ran on raw text. The untouched `group_df` is
  # still what pred_function() receives, so the `text` column of the output
  # keeps the original tweet text.
  clean_df <- copy(group_df)
  clean_df$text <- group_df$text %>%
    stri_replace_all_regex("@\\w+", "") %>%   # drop @mentions
    rm_twitter_url() %>%
    str_replace_all("&amp;", "") %>%          # HTML entity, not bare "amp"
    str_remove_all("[[:punct:]]") %>%         # also removes "@" and "#"
    str_replace_all("[^\x01-\x7F]", "") %>%   # removes emojis
    str_replace_all("[[:digit:]]+", "")

  # Chunk index per row; identical for the raw and the normalised table so
  # the two lists of chunks stay aligned.
  chunk <- 100000
  n <- nrow(group_df)
  r <- ceiling(seq_len(n) / chunk)
  d <- split(group_df, r)
  clean_chunks <- split(clean_df, r)

  # Inputs are unnamed so the chunk labels do not end up as rbind() argument
  # names (which would prefix the row names of the result).
  pred <- Map(
    function(clean_part, raw_part) {
      pred_function(cleaning(clean_part), raw_part)
    },
    unname(clean_chunks),
    unname(d)
  )

  pred <- do.call("rbind", pred)
  assign(paste0("pred_", tolower(i)), pred)

}


# OUTDATA
#===============================================================================
# Write the five prediction tables to CSV, one file per group, pairing each
# table with its output file name.
invisible(Map(
  function(pred_tbl, out_file) {
    save_as_csv(
      pred_tbl,
      file_name = out_file,
      prepend_ids = TRUE,
      na = "",
      fileEncoding = "UTF-8"
    )
  },
  list(
    pred_politicians,
    pred_media,
    pred_citizens,
    pred_advocates,
    pred_friends
  ),
  c(
    "pred_politicians.csv",
    "pred_media.csv",
    "pred_citizens.csv",
    "pred_advocates.csv",
    "pred_friends.csv"
  )
))


